Notebook Structure

  1. Importing Required Libraries
  2. Loading Data
  3. Missing Values and Outliers
  4. Data Exploration

    a. Budget distribution by Service

    b. How has the budget distribution changed over the years?

    c. Which department gets the most money?

    d. Exploring Department column

Importing Required Libraries

# loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore")

Loading data

# reading the data
df = pd.read_csv('./data/sf_budget_2022-25_cleaned.csv')
df.shape
(209, 8)
df.head()
service_area department department_description division link budget_2022_23 budget_2023_24 budget_2024_25
0 Culture & Recreation Academy Of Sciences A science museum - and scientific and educatio... Academy of Sciences https://www.calacademy.org/ 7422345 7460485 7329391
1 Public Protection Adult Probation Aims to rehabilitate adult offenders to promot... Adult Probation https://sf.gov/departments/adult-probation-dep... 58036486 58116740 57581502
2 Public Works, Transportation & Commerce Airport SFO is the Bay Area’s largest airport by passe... Airport Director https://www.flysfo.com/about/airport-commission 9023494 9142274 9468694
3 Public Works, Transportation & Commerce Airport SFO is the Bay Area’s largest airport by passe... Bureau Of Admin & Policy https://www.flysfo.com/about/airport-commission 29574462 29570940 31846599
4 Public Works, Transportation & Commerce Airport SFO is the Bay Area’s largest airport by passe... Capital Projects https://www.flysfo.com/about/airport-commission 91229001 53385000 53385000
# overview of the data file
def data_overview(df):
    # Initialize an empty DataFrame
    info_df = pd.DataFrame(index=df.columns)
    
    # Add columns to the info DataFrame
    info_df['DataType'] = df.dtypes
    info_df['Count'] = df.count()
    info_df['Missing Values'] = df.isnull().sum()
    info_df['Unique Values'] = df.nunique()
    
    # Calculate min and max values only for integer columns
    int_columns = df.select_dtypes(include='int').columns
    info_df.loc[int_columns, 'Min Value'] = df[int_columns].min()
    info_df.loc[int_columns, 'Max Value'] = df[int_columns].max()
    
    # Use a lambda function to apply value_counts to each column
#     info_df['Value Counts'] = df.apply(lambda x: x.value_counts().to_dict())
    
    return info_df
data_overview(df)
DataType Count Missing Values Unique Values Min Value Max Value
service_area object 209 0 8 NaN NaN
department object 209 0 56 NaN NaN
department_description object 209 0 54 NaN NaN
division object 209 0 181 NaN NaN
link object 205 4 125 NaN NaN
budget_2022_23 int64 209 0 207 -2.605628e+09 1.855331e+09
budget_2023_24 int64 209 0 203 -2.667358e+09 1.830274e+09
budget_2024_25 int64 209 0 204 -2.534256e+09 1.529163e+09

Missing values or Outliers

# Make sure 'budget2022', 'budget2023', 'budget2024' are numerical columns
df_melted = pd.melt(df, id_vars=None, value_vars=['budget_2022_23', 'budget_2023_24', 'budget_2024_25'],
                    var_name='Budget Year', value_name='Budget')

# Plot the box plot using Plotly
fig = px.box(df_melted, x='Budget Year', y='Budget',
             labels={'Budget': 'Budget'},
             title='Box Plot of Budget by Year',
             height=500)
fig.update_layout(showlegend=False)  # No need for legend in this case
fig.show()
# Melt the DataFrame
df_melted = pd.melt(df, id_vars=None, value_vars=['budget_2022_23', 'budget_2023_24', 'budget_2024_25'],
                    var_name='Budget Year', value_name='Budget')

# Calculate whiskers for outlier removal
Q1 = df_melted['Budget'].quantile(0.25)
Q3 = df_melted['Budget'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
df_no_outliers = df_melted[(df_melted['Budget'] >= lower_bound) & (df_melted['Budget'] <= upper_bound)]

# Plot the box plot without outliers
fig = px.box(df_no_outliers, x='Budget Year', y='Budget',
             points="all",  # Show all points (not just outliers)
             labels={'Budget': 'Budget'},
             title='Box Plot of Budget by Year (Without Outliers)',
             height=500)
fig.update_layout(showlegend=False)  # No need for legend in this case
fig.show()

Data Exploration

Budget distribution by Service

# Looking at budget 2022-2023 distribution across service_area
fig = px.bar(df, x='service_area', y='budget_2022_23', color='service_area', title='Budget Distribution by Service',
             labels={'budget_2022_23': 'Budget', 'service_area': 'Service'})

fig.update_layout(xaxis_title='Service', yaxis_title='Budget', barmode='stack')

fig.show()
# Notes and updates - 
# why is there that one negative value? inference?
# Explore if we can add a drop down for the budget columns to easily switch between the years
# too much information. can we make the category names smaller?

How has the budget distribution changed over the years?

df.columns
Index(['service_area', 'department', 'department_description', 'division',
       'link', 'budget_2022_23', 'budget_2023_24', 'budget_2024_25'],
      dtype='object')
# Make sure 'budget2022', 'budget2023', 'budget2024' are numerical columns
df['change_22_23_to_23_24'] = df['budget_2023_24'] - df['budget_2022_23']
df['change_23_24_to_24_25'] = df['budget_2024_25'] - df['budget_2023_24']

# Plot the change using Plotly
fig = px.bar(df, x='service_area', y=['change_22_23_to_23_24', 'change_23_24_to_24_25'],
             labels={'value': 'Budget Change'},
             title='Budget Change from 22-23 to 23-24 and from 23-24 to 24-25 by Service',
             height=500)
fig.update_layout(barmode='group')
fig.show()
# this chart needs to be improved as the calculations are per row 
# instead of a groupby.

Which department gets the most money?

# top 15 departments (out of 56) by budget in a horizontal bar chart

# calculating total budget at department level
df_grouped = df.groupby('department')['budget_2022_23'].sum().reset_index()
df_grouped = df_grouped.sort_values(by='budget_2022_23', ascending=False).head(15)  # Displaying top 10 categories

fig = px.bar(df_grouped, x='budget_2022_23', y='department', orientation='h',
             title='Top Departments by Budget',
             labels={'budget': 'budget_2022_23', 'department': 'Department'},
             )

fig.update_layout(xaxis_title='Budget', yaxis_title='Department', barmode='stack')

fig.show()
# Notes:
# out of the total 14.1 billion, 3 billion is in public health
# followed by general city responsibility and public utilities

Exploring Department column

text = ' '.join(df['department'].dropna().astype(str))

# Create and generate a word cloud image
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    contour_width=3,
    contour_color='steelblue',
    colormap='viridis',  # You can change the color map
    max_words=200  # Adjust the number of words displayed
).generate(text)

# Save the word cloud as an image with higher DPI
wordcloud.to_file("wordcloud.png")

# Display the interactive version using Plotly
fig = px.imshow(wordcloud.to_array(), binary_string=True, height=400, width=800)
fig.update_layout(coloraxis_showscale=False)
fig.show()
# notes-
# we can explore service, airport, numicipal, transportation separately
# can we make this plot with budget as the metric instead of value counts?

Second dataset

import chardet

with open('./data/addbacksSpreadsheet_cleaned.csv', 'rb') as f:
    result = chardet.detect(f.read())

df2 = pd.read_csv('./data/addbacksSpreadsheet_cleaned.csv', encoding=result['encoding'])
df2 = pd.read_csv('./data/addbacksSpreadsheet_cleaned.csv')